##Libraries
library(tidyverse)
library(data.table)
library(matrixStats)
library(lubridate)
library(patchwork)
library(ggraph)
library(igraph)
library(ape)
Load data
# The raw sequence data is available as part of the COG-UK data set (PRJEB37886).
# However, the association between sequences and metadata has been retained to
# protect the privacy of patients. To request access please contact the authors.
# msa <- read.dna('./data/alignment_carehome_20200720.fasta', format = 'fasta')
meta <- fread("./data/carehome_clustering_input_metadata_20200720.csv") %>% as_tibble()
meta$collection_date_filled <- as.Date(meta$collection_date_filled, format = "%d/%m/%y")
write.csv(meta[, 1:2], file = "./processed_data/dates.csv", quote = FALSE, row.names = FALSE)
run the transcluster algorithm with liberal thresholds initially
fasttranscluster -o processed_data --save_probs --msa data/alignment_carehome_20200720.fasta --dates processed_data/dates.csv -K 20 --snp_threshold 15 -t 4
fasttranscluster -o processed_data_check_cog --save_probs --msa ./data/check_with_cog/carehome_public_genomes_align_update.fasta --dates processed_data/dates.csv -K 20 --snp_threshold 15 -t 4
import pyfastx
headers = []
for name, seq in pyfastx.Fasta('data/alignment_carehome_20200720.fasta', build_index=False):
keep_seq = seq
headers.append(name)
with open('./processed_data_identical/identical.fasta' ,'w') as outfile:
for h in headers:
outfile.write(">"+h+"\n"+keep_seq+"\n")
fasttranscluster -o processed_data_identical --save_probs --msa ./processed_data_identical/identical.fasta --dates processed_data/dates.csv -K 20 --snp_threshold 15 -t 4
trans <- fread("./processed_data/transcluster_probabilities.csv") %>% as_tibble()
trans$care_home_A <- meta$anonymised_care_home_code[match(trans$sampleA, meta$anonymised_sample_id)]
trans$care_home_B <- meta$anonymised_care_home_code[match(trans$sampleB, meta$anonymised_sample_id)]
trans$dateA <- meta$collection_date_filled[match(trans$sampleA, meta$anonymised_sample_id)]
trans$dateB <- meta$collection_date_filled[match(trans$sampleB, meta$anonymised_sample_id)]
trans$date_diff <- abs(trans$dateA - trans$dateB)
all_care_homes <- unique(meta$anonymised_care_home_code)
nintrodictions <- map_dfr(seq(0, 1, 0.005), function(prob) {
nintros <- map_dbl(all_care_homes, ~{
temp_trans <- trans %>% filter(care_home_A == .x) %>% filter(care_home_B ==
.x)
temp_samples <- meta$anonymised_sample_id[meta$anonymised_care_home_code ==
.x]
if (length(temp_samples) < 2)
return(1)
d <- matrix(0, nrow = length(temp_samples), ncol = length(temp_samples),
dimnames = list(temp_samples, temp_samples))
d[as.matrix(temp_trans[, c("sampleA", "sampleB")])] <- exp(apply(temp_trans[,
c("0", "1", "2")], 1, logSumExp))
d[as.matrix(temp_trans[, c("sampleB", "sampleA")])] <- d[as.matrix(temp_trans[,
c("sampleA", "sampleB")])]
diag(d) <- 1
h <- hclust(as.dist(1 - d), method = "single")
clust <- cutree(h, h = 1 - prob)
return(length(unique(clust)))
})
tibble(cutoff = prob, nintros = list(nintros))
})
nintrodictions$max_intros <- map_dbl(nintrodictions$nintros, max)
nintrodictions$mean_intros <- map_dbl(nintrodictions$nintros, mean)
nintrodictions$total_intros <- map_dbl(nintrodictions$nintros, sum)
ggplot(nintrodictions, aes(x = cutoff, y = total_intros)) + geom_point() + theme_bw(base_size = 12) +
geom_vline(xintercept = 0.15, color = "red") + xlab("cutoff probability \n minimum probability cases are seperated by at most 2 intermediate hosts") +
ylab("total number of clusters across all care homes")
ggsave("./Figures/num_clusters_by_threshold.png", width = 12, height = 7)
ggsave("./Figures/num_clusters_by_threshold.pdf", width = 12, height = 7)
## minimum probability of a transmission with at most 2 intermediate hosts
MIN_PROB <- 0.15
trans_with_clusters <- map_dfr(all_care_homes, ~{
temp_trans <- trans %>% filter(care_home_A == .x) %>% filter(care_home_B == .x)
temp_samples <- meta$anonymised_sample_id[meta$anonymised_care_home_code == .x]
if ((length(temp_samples) < 2) | (nrow(temp_trans) < 1))
return(tibble())
d <- matrix(0, nrow = length(temp_samples), ncol = length(temp_samples), dimnames = list(temp_samples,
temp_samples))
d[as.matrix(temp_trans[, c("sampleA", "sampleB")])] <- exp(apply(temp_trans[,
c("0", "1", "2")], 1, logSumExp))
d[as.matrix(temp_trans[, c("sampleB", "sampleA")])] <- d[as.matrix(temp_trans[,
c("sampleA", "sampleB")])]
diag(d) <- 1
h <- hclust(as.dist(1 - d), method = "single")
clust <- cutree(h, h = 1 - MIN_PROB)
temp_trans$clustA <- paste(.x, clust[temp_trans$sampleA], sep = "_")
temp_trans$clustB <- paste(.x, clust[temp_trans$sampleB], sep = "_")
return(temp_trans %>% filter(clustA == clustB))
})
plot histograms of resulting SNP and date differences between inferred links
ggplot(trans_with_clusters, aes(x = snp_distance)) + geom_bar(fill = "#4682B4") +
theme_bw(base_size = 14) + xlab("SNP distance")
ggsave("./Figures/snp_dist_histo.png", width = 9, height = 7)
ggsave("./Figures/snp_dist_histo.pdf", width = 9, height = 7)
ggplot(trans_with_clusters, aes(x = date_diff)) + geom_bar(fill = "#4682B4") + theme_bw(base_size = 14) +
xlab("date difference")
ggsave("./Figures/date_dist_histo.png", width = 9, height = 7)
ggsave("./Figures/date_dist_histo.pdf", width = 9, height = 7)
plotdf <- trans_with_clusters
plotdf$cluster <- factor(plotdf$clustA, levels = unique(names(sort(map_dbl(split(trans_with_clusters$date_diff,
trans_with_clusters$clustA), median), decreasing = TRUE))))
ggplot(plotdf, aes(x = cluster, y = date_diff, group = clustA)) + geom_boxplot(fill = "#4682B4",
outlier.colour = NA) + geom_point() + theme_bw(base_size = 14) + ylab("date difference") +
xlab("cluster")
ggsave("./Figures/date_dist_boxplot.png", width = 9, height = 7)
ggsave("./Figures/date_dist_boxplot.pdf", width = 9, height = 7)
plot number of samples versus number of clusters by care home
clusters <- map_dfr(all_care_homes, ~{
temp_trans <- trans %>% filter(care_home_A == .x) %>% filter(care_home_B == .x)
temp_samples <- meta$anonymised_sample_id[meta$anonymised_care_home_code == .x]
if ((length(temp_samples) < 2) | (nrow(temp_trans) < 1))
return(tibble(home = .x, sample = temp_samples, cluster = paste(.x, 1:length(temp_samples),
sep = "_")))
d <- matrix(0, nrow = length(temp_samples), ncol = length(temp_samples), dimnames = list(temp_samples,
temp_samples))
d[as.matrix(temp_trans[, c("sampleA", "sampleB")])] <- exp(apply(temp_trans[,
c("0", "1", "2")], 1, logSumExp))
d[as.matrix(temp_trans[, c("sampleB", "sampleA")])] <- d[as.matrix(temp_trans[,
c("sampleA", "sampleB")])]
diag(d) <- 1
h <- hclust(as.dist(1 - d), method = "single")
clust <- cutree(h, h = 1 - MIN_PROB)
missing <- temp_samples[!temp_samples %in% names(clust)]
return(tibble(home = .x, sample = c(names(clust), missing), cluster = c(paste(.x,
clust, sep = "_"), paste(rep(.x, length(missing)), max(clust) + seq_along(missing),
sep = "_"))))
})
# write out clusters
write.csv(clusters, file = "./processed_data/final_clusters.csv", quote = FALSE,
row.names = FALSE)
nintros_by_home <- clusters %>% group_by(home) %>% summarise(n_samples = n(), n_intros = length(unique(cluster))) %>%
arrange(n_samples)
nintros_by_home$home <- factor(nintros_by_home$home, levels = unique(nintros_by_home$home))
sum(nintros_by_home$n_intros)
## [1] 409
plotdf <- nintros_by_home %>% group_by(n_samples, n_intros) %>% summarise(count = n())
ggplot(plotdf, aes(x = n_samples, y = n_intros, size = count)) + geom_point() + theme_bw(base_size = 14) +
xlab("number of samples") + ylab("number of clusters") + scale_size_continuous(name = "care home count")
ggsave("./Figures/nsamples_vs_nclusters.png", width = 9, height = 7)
ggsave("./Figures/nsamples_vs_nclusters.pdf", width = 9, height = 7)
plot the more interesting networks with at least 7 samples. I am not sure how informative these are.
interesting_homes <- as.character(unique(nintros_by_home$home[nintros_by_home$n_samples >=
2]))
vertex_attributes <- clusters[, c(2, 1, 3)] %>% filter(home %in% interesting_homes)
edge_attributes <- trans_with_clusters %>% filter(sampleA %in% vertex_attributes$sample) %>%
filter(sampleB %in% vertex_attributes$sample)
p <- exp(apply(edge_attributes[, c("0", "1", "2")], 1, logSumExp))
edge_attributes <- edge_attributes[, c("sampleA", "sampleB", "snp_distance", "date_diff")]
edge_attributes$home <- meta$anonymised_care_home_code[match(edge_attributes$sampleA,
meta$anonymised_sample_id)]
edge_attributes$`transmission probability` <- p
edge_attributes <- edge_attributes %>% filter(`transmission probability` >= MIN_PROB)
graph <- igraph::graph_from_data_frame(edge_attributes, vertices = vertex_attributes)
ggraph(graph, layout = "kk") + geom_edge_fan(aes(colour = `transmission probability`)) +
geom_node_point(size = 3) + scale_color_brewer(type = "qual", palette = 3) +
theme_graph(foreground = "steelblue", fg_text_colour = "white", base_size = 14) +
facet_wrap(~home, scales = "free", ncol = 7) + scale_edge_colour_gradientn(colours = c("#313695",
"#ffffbf", "#a50026"))
ggsave("./Figures/interesting_transmission_networks.png", width = 15, height = 9)
ggsave("./Figures/all_transmission_networks.pdf", width = 12, height = 25, device = cairo_pdf)
##Healthcare worker analysis
Load data
# The raw sequence data is available as part of the COG-UK data set (PRJEB37886).
# However, the association between sequences and metadata has been retained to
# protect the privacy of patients. To request access please contact the authors.
# hcw_msa <- read.dna('./data/alignment_cuh_hcw_care_20200723.fasta', format =
# 'fasta')
hcw_meta <- fread("./data/cuh_hcw_care_cluster_analysis_set_20200723.csv") %>% as_tibble()
hcw_meta$collection_date_filled <- as.Date(hcw_meta$collection_date_filled, format = "%d/%m/%y")
write.csv(hcw_meta[, 1:2], file = "./processed_data/hcw_dates.csv", quote = FALSE,
row.names = FALSE)
run the transcluster algorithm with liberal thresholds initially
mkdir processed_data/hcw
fasttranscluster -o processed_data/hcw --save_probs --msa data/alignment_cuh_hcw_care_20200723.fasta --dates processed_data/hcw_dates.csv -K 20 --snp_threshold 15 -t 4
hcw_trans <- fread("./processed_data/hcw/transcluster_probabilities.csv") %>% as_tibble()
hcw_trans$care_home_A <- hcw_meta$anonymised_care_home_code[match(hcw_trans$sampleA,
hcw_meta$anonymised_sample_id)]
hcw_trans$care_home_B <- hcw_meta$anonymised_care_home_code[match(hcw_trans$sampleB,
hcw_meta$anonymised_sample_id)]
hcw_trans$dateA <- hcw_meta$collection_date_filled[match(hcw_trans$sampleA, hcw_meta$anonymised_sample_id)]
hcw_trans$dateB <- hcw_meta$collection_date_filled[match(hcw_trans$sampleB, hcw_meta$anonymised_sample_id)]
hcw_trans$date_diff <- abs(hcw_trans$dateA - hcw_trans$dateB)
inital clustering of all individuals
temp_samples <- hcw_meta$anonymised_sample_id
d <- matrix(0, nrow = length(temp_samples), ncol = length(temp_samples), dimnames = list(temp_samples,
temp_samples))
d[as.matrix(hcw_trans[, c("sampleA", "sampleB")])] <- exp(apply(hcw_trans[, c("0",
"1", "2")], 1, logSumExp))
d[as.matrix(hcw_trans[, c("sampleB", "sampleA")])] <- d[as.matrix(hcw_trans[, c("sampleA",
"sampleB")])]
diag(d) <- 1
h <- hclust(as.dist(1 - d), method = "single")
clust <- cutree(h, h = 1 - MIN_PROB)
missing <- temp_samples[!temp_samples %in% names(clust)]
all_clust <- tibble(sample = c(names(clust), missing), cluster = c(paste("CH_HCW",
clust, sep = "_"), paste(rep("CH_HCW", length(missing)), max(clust) + seq_along(missing),
sep = "_")))
all_clust$hcw_status <- hcw_meta$cuh_hcw_care_status[match(all_clust$sample, hcw_meta$anonymised_sample_id)]
write.csv(all_clust, file = "./processed_data/final_hcw_clustering_combined.csv",
quote = FALSE, row.names = FALSE)
plot all on one big graph
vertex_attributes <- hcw_meta
edge_attributes <- hcw_trans
p <- exp(apply(edge_attributes[, c("0", "1", "2")], 1, logSumExp))
edge_attributes <- edge_attributes[, c("sampleA", "sampleB", "snp_distance", "date_diff")]
edge_attributes$home <- hcw_meta$anonymised_care_home_code[match(edge_attributes$sampleA,
hcw_meta$anonymised_sample_id)]
edge_attributes$home[edge_attributes$home == ""] <- "HCW"
edge_attributes$`transmission probability` <- p
edge_attributes <- edge_attributes %>% filter(`transmission probability` >= MIN_PROB)
vertex_attributes$status <- ifelse(vertex_attributes$cuh_hcw_care_status == "hcw",
"health care worker", "care home resident")
hcws <- hcw_meta$anonymised_sample_id[hcw_meta$cuh_hcw_care_status == "hcw"]
edge_attributes$involves_hcw <- (edge_attributes$sampleA %in% hcws) | (edge_attributes$sampleB %in%
hcws)
graph <- igraph::graph_from_data_frame(edge_attributes, vertices = vertex_attributes)
ggraph(graph, layout = "kk") + geom_edge_fan(aes(colour = `transmission probability`,
alpha = involves_hcw)) + geom_node_point(aes(colour = status), size = 3) + scale_color_brewer(type = "qual",
palette = 3) + theme_graph(foreground = "steelblue", fg_text_colour = "white",
base_size = 14) + scale_color_manual(values = c("#377eb8", "#4daf4a")) + scale_edge_alpha_manual(values = c(1,
0.2), guide = FALSE) + scale_edge_colour_gradientn(colours = c("#313695", "#ffffbf",
"#a50026"))
ggsave("./Figures/hcw_transmission_networks_combined.png", width = 15, height = 9)
cluster into plausible transmission chains seperated by care home allowing HCW’s to appear in multiple care homes.
hcw_clusters <- map_dfr(all_care_homes, ~{
temp_trans <- hcw_trans %>% filter((care_home_A == .x) | (care_home_A == "")) %>%
filter((care_home_B == .x) | (care_home_B == "")) %>% filter(!((care_home_A ==
"") & (care_home_B == "")))
if (nrow(temp_trans) < 1)
return(tibble())
temp_samples <- hcw_meta$anonymised_sample_id[hcw_meta$anonymised_care_home_code %in%
c(.x, "")]
if ((length(temp_samples) < 2) | (nrow(temp_trans) < 1))
return(tibble(home = .x, sample = temp_samples, cluster = paste(.x, 1:length(temp_samples),
sep = "_")))
d <- matrix(0, nrow = length(temp_samples), ncol = length(temp_samples), dimnames = list(temp_samples,
temp_samples))
d[as.matrix(temp_trans[, c("sampleA", "sampleB")])] <- exp(apply(temp_trans[,
c("0", "1", "2")], 1, logSumExp))
d[as.matrix(temp_trans[, c("sampleB", "sampleA")])] <- d[as.matrix(temp_trans[,
c("sampleA", "sampleB")])]
diag(d) <- 1
h <- hclust(as.dist(1 - d), method = "single")
clust <- cutree(h, h = 1 - MIN_PROB)
missing <- temp_samples[!temp_samples %in% names(clust)]
temp_clust <- tibble(home = .x, sample = c(names(clust), missing), cluster = c(paste(.x,
clust, sep = "_"), paste(rep(.x, length(missing)), max(clust) + seq_along(missing),
sep = "_")))
temp_clust$hcw_status <- hcw_meta$cuh_hcw_care_status[match(temp_clust$sample,
hcw_meta$anonymised_sample_id)]
# only keep those clusters with both HCW and carehome residents
keep <- temp_clust %>% group_by(cluster) %>% summarise(ncat = length(unique(hcw_status))) %>%
filter(ncat > 1)
return(temp_clust %>% filter(cluster %in% keep$cluster))
})
# write out clusters
write.csv(hcw_clusters, file = "./processed_data/final_hcw_clusters.csv", quote = FALSE,
row.names = FALSE)
plot the networks that contain both HCW and carehome residents. HCW can be included in multiple clusters as we split by carehome.
interesting_homes <- unique(hcw_clusters$home)
vertex_attributes <- hcw_clusters[, c(2, 1, 3, 4)] %>% filter(home %in% interesting_homes)
# relabel healthcare worker to give them a unique ID per carehome
vertex_attributes$sample <- imap_chr(vertex_attributes$sample, ~{
if (vertex_attributes$hcw_status[[.y]] == "hcw") {
paste(.x, vertex_attributes$home[[.y]], sep = "_")
} else {
.x
}
})
edge_attributes <- hcw_trans %>% filter(sampleA %in% hcw_clusters$sample) %>% filter(sampleB %in%
hcw_clusters$sample)
hcws <- hcw_meta$anonymised_sample_id[hcw_meta$cuh_hcw_care_status == "hcw"]
edge_attributes <- map_dfr(interesting_homes, ~{
temp <- edge_attributes
temp$sampleA[temp$sampleA %in% hcws] <- paste(temp$sampleA[temp$sampleA %in%
hcws], .x, sep = "_")
temp$sampleB[temp$sampleB %in% hcws] <- paste(temp$sampleB[temp$sampleB %in%
hcws], .x, sep = "_")
return(temp)
})
p <- exp(apply(edge_attributes[, c("0", "1", "2")], 1, logSumExp))
edge_attributes <- edge_attributes[, c("sampleA", "sampleB", "snp_distance", "date_diff")]
edge_attributes$home <- hcw_meta$anonymised_care_home_code[match(edge_attributes$sampleA,
hcw_meta$anonymised_sample_id)]
edge_attributes$home[is.na(edge_attributes$home)] <- gsub(".*_", "", edge_attributes$sampleA[is.na(edge_attributes$home)])
edge_attributes$`transmission probability` <- p
edge_attributes <- edge_attributes %>% filter(`transmission probability` >= MIN_PROB)
edge_attributes <- edge_attributes %>% filter((sampleA %in% vertex_attributes$sample) &
(sampleB %in% vertex_attributes$sample))
edge_attributes$homeA <- hcw_meta$anonymised_care_home_code[match(edge_attributes$sampleA,
hcw_meta$anonymised_sample_id)]
edge_attributes$homeA[is.na(edge_attributes$homeA)] <- gsub(".*_", "", edge_attributes$sampleA[is.na(edge_attributes$homeA)])
edge_attributes$homeB <- hcw_meta$anonymised_care_home_code[match(edge_attributes$sampleB,
hcw_meta$anonymised_sample_id)]
edge_attributes$homeB[is.na(edge_attributes$homeB)] <- gsub(".*_", "", edge_attributes$sampleB[is.na(edge_attributes$homeB)])
edge_attributes <- edge_attributes %>% filter(homeA == homeB)
link_list <- imap_dfr(hcw_meta$anonymised_sample_id[hcw_meta$cuh_hcw_care_status ==
"carehome_resident"], ~{
temp <- edge_attributes %>% filter(sampleA == .x)
tempB <- edge_attributes[, c(2, 1, 3, 4, 5, 6, 8, 7)] %>% filter(sampleB == .x)
colnames(tempB) <- colnames(temp)
temp <- rbind(temp, tempB)
if (nrow(temp) < 1)
return(tibble(resident = .x, care_home = hcw_meta$anonymised_care_home_code[hcw_meta$anonymised_sample_id ==
.x], hcw_links = "", res_links = ""))
return(tibble(resident = .x, care_home = unique(temp$home), hcw_links = paste(unique(paste(gsub("_.*",
"", temp$sampleB[grepl("_", temp$sampleB)]), format(temp$`transmission probability`[grepl("_",
temp$sampleB)], digits = 2), sep = "_")), collapse = ","), res_links = paste(unique(paste(gsub("_.*",
"", temp$sampleB[!grepl("_", temp$sampleB)]), format(temp$`transmission probability`[!grepl("_",
temp$sampleB)], digits = 2), sep = "_")), collapse = ",")))
})
write.table(link_list, file = "./processed_data/hcw_links__collapsed_by_resident.tsv",
sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
edge_attributes <- edge_attributes[!duplicated(edge_attributes[, c(1, 2)]), ]
edge_attributes$involves_hcw <- grepl("_", edge_attributes$sampleA) | grepl("_",
edge_attributes$sampleB)
edge_attributes <- edge_attributes[!(grepl("_", edge_attributes$sampleA) & grepl("_",
edge_attributes$sampleB)), ]
edge_attributes <- edge_attributes %>% arrange(-`transmission probability`)
vertex_attributes <- vertex_attributes %>% filter((sample %in% unique(c(edge_attributes$sampleA,
edge_attributes$sampleB))) | (hcw_status == "carehome_resident"))
vertex_attributes$status <- ifelse(vertex_attributes$hcw_status == "hcw", "health care worker",
"care home resident")
graph <- igraph::graph_from_data_frame(edge_attributes, vertices = vertex_attributes)
ggraph(graph, layout = "kk") + geom_edge_fan(aes(colour = `transmission probability`,
alpha = involves_hcw)) + geom_node_point(aes(colour = status), size = 3) + scale_color_brewer(type = "qual",
palette = 3) + theme_graph(foreground = "steelblue", fg_text_colour = "white",
base_size = 16) + facet_wrap(~home, scales = "free", ncol = 3) + scale_color_manual(values = c("#377eb8",
"#4daf4a")) + scale_edge_alpha_manual(values = c(1, 0.2), guide = FALSE) + scale_edge_colour_gradientn(colours = c("#313695",
"#ffffbf", "#a50026")) + theme(strip.text.x = element_text(size = 14))
ggsave("./Figures/hcw_transmission_networks.png", width = 9, height = 12, device = cairo_pdf)
ggsave("./Figures/hcw_transmission_networks.pdf", width = 9, height = 12, device = cairo_pdf)
edge_attributes$sampleA <- map_chr(str_split(edge_attributes$sampleA, "_"), ~.x[[1]])
edge_attributes$sampleB <- map_chr(str_split(edge_attributes$sampleB, "_"), ~.x[[1]])
write.csv(edge_attributes[, c("sampleA", "sampleB", "snp_distance", "date_diff",
"home", "transmission probability", "involves_hcw")], file = "./processed_data/hcw_transmission_links.csv",
quote = FALSE, row.names = FALSE)